""" Clean up the contract dataset and get it ready to merge

"""

import pandas as pd
import utils

#%% SETUP CONFIGURATION -----------------------------------------------------------------
data_folder = 'data_py'
path = f'./{data_folder}/temp/'

well_to_well_names = [
    'well-to-well',
    'well to well',
    'well to well.',
    'well-to-well.',
    'well by well',
    'well-by-well',
    'month by month',
    '1well',
    '30-40 day extension',
    'well to well extension',
    '1 well.',
    'well x well'
]

multiplier_names = {
    'days': 1,
    'day': 1,
    'months': 30,
    'month': 30,
    'well': 40,
    'wells': 40,
    'wells.': 40,
    'year': 365,
    'years': 365,
}

keep_columns = [
    'rig_name',
    'contractor',
    'operator',
    'fixture_date',
    'day_rate',
    'start',
    'end',
    'description',
    'total_days_description',
    'type',
    'next_start',
    'end_by_duration',
    'duration',
    'turnkey'
]

#%% IMPORT DATA ---------------------------------------------------------------------
df = pd.read_excel(
    f'./{data_folder}/raw/contracts'
    f'/US GoM fixtures 1996 to May 2016 with Turnkey fixtures highlighted.xlsx',
    sheet_name='Fixtures US GoM'
)
df.columns = df.columns.str.lower().str.replace(' ', '_')
df['day_rate'] = pd.to_numeric(df['day_rate'], errors='coerce')
df['turnkey'] = df['turnkey'].fillna(0.0)

# Save deflator data
df_deflator = pd.read_csv('data_py/processed/deflator_daily.csv', index_col=[0])
df_deflator['date'] = pd.to_datetime(df_deflator['date'])

#%% CONVERT RAW DATA TO CLEANED CONTRACTS -------------------------------------------
# Set contract start = fixture date for renegotiations etc
df.loc[df['fixture_date'] >= df['contract_start'], 'contract_start'] = df['fixture_date']

# Clean up the dataframe
df = (
    df
    .rename({'contract_start': 'start', 'contract_description': 'description'}, axis=1)
    .query('rig_type == "Jackup"')
    .pipe(utils.to_csv_return, path + '04_clean_contracts/contracts_all.csv')
    .query('20000101 <= fixture_date <= 20151231')
    .assign(
        description=lambda x: x['description'].str.lower(),
        rig_name=lambda x: x['rig_name'].str.lower()
    )
    .dropna(subset=['day_rate'])  # Drop contracts missing day rate (check this is ok?)
).replace(well_to_well_names, '1 well')

# Deflate the dayrates
df = df.merge(
    df_deflator,
    left_on='fixture_date',
    right_on='date',
    how='left'
)

df['day_rate'] = df['day_rate'] / (df['gdpdef'] / 100)

#%% GET DURATION ------------------------------------------------------------------------
print("GET DURATION")

df = (
    df
    .sort_values(by=['rig_name', 'start'])
    .assign(
        total_days_description=lambda x: x.apply(
            utils.get_days_from_description,
            multiplier_names=multiplier_names,
            axis=1
        ),
        next_start=lambda x: x.groupby('rig_name')['start'].shift(-1),
        end=lambda x: x.apply(utils.get_end, axis=1),
        end_by_duration=
            lambda x: x['start'] + pd.to_timedelta(x['total_days_description'], unit='d')
    )
)

# When computing the actual duration of the contract I use a more restrictive
# definition of duration: end = min(description, next start):
df['end_min'] = df['end']
df.loc[df['end_min'] > df['end_by_duration'], 'end_min'] = df['end_by_duration']
df['duration'] = (df['end_min'] - df['start']).dt.days

#%% SAVE --------------------------------------------------------------------------------
df = (
    df.pipe(
        utils.to_csv_return,
        path + '04_clean_contracts/contracts_with_duration_full.csv'
    )
    .loc[:, keep_columns]
    .reset_index()
    .drop('index', axis=1)
    .query('end > start')
    .pipe(utils.to_csv_return, path + '04_clean_contracts/contracts_processed.csv')
)
